/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.fetcher;
import net.nutch.net.protocols.Response;
import net.nutch.pagedb.FetchListEntry;
import net.nutch.io.*;
import net.nutch.db.*;
import net.nutch.util.*;
import net.nutch.util.RobotsMetaProcessor.*;
import net.nutch.util.DOMContentUtils;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.logging.Logger;
import java.util.logging.Level;
import java.util.logging.Handler;
import org.cyberneko.html.parsers.*;
import org.xml.sax.*;
import org.w3c.dom.*;
import org.w3c.dom.html.*;
import org.apache.html.dom.*;
/**
 * Worker thread which polls the {@link RequestScheduler} for finished
 * requests, performs title, link and plain-text extraction on fetched
 * HTML pages, and appends the results to the output databases.
 */
public class OutputThread extends Thread implements FetcherConstants {

  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.fetcher.OutputThread");

  /** Time to sleep, in milliseconds, when the scheduler has no output ready. */
  public static final int NO_OUTPUT_DELAY_MS= 2 * 1000;

  /** Media-type prefix of the only content this thread knows how to parse. */
  private static final String HTML_CONTENT_TYPE = "text/html";

  private final RequestScheduler scheduler;
  private final DOMFragmentParser parser;

  // Reused for every page; NOTE(review): presumably reset by
  // RobotsMetaProcessor.getRobotsMetaDirectives() on each call -- confirm.
  private final RobotsMetaIndicator robotsMeta;

  private final ArrayFile.Writer fetcherDb;       // the output
  private final ArrayFile.Writer rawDb;
  private final ArrayFile.Writer strippedDb;

  /** Thrown when a response carries a Content-Type other than HTML. */
  private static class UnhandledContentTypeException extends Exception {
    private final String contentType;

    UnhandledContentTypeException(String contentType) {
      super("unhandled content type: " + contentType);
      this.contentType= contentType;
    }

    String getContentType() {
      return contentType;
    }
  }

  /**
   * Used to indicate that an unexpected error came out of the DOM
   * parser; the original failure is kept as the cause.
   */
  private static class DOMErrorException extends Exception {
    DOMErrorException(Throwable cause) {
      super(cause);
    }
  }

  /**
   * Creates a new <code>OutputThread</code>, which will poll the
   * given <code>scheduler</code> for output, and write output records
   * to the supplied databases.
   *
   * @param scheduler  source of finished requests
   * @param fetcherDb  receives one {@link FetcherOutput} per entry; its
   *                   monitor is also held while all three writers are
   *                   appended to
   * @param rawDb      receives one {@link FetcherContent} per entry
   * @param strippedDb receives one {@link FetcherText} per entry
   */
  protected OutputThread(RequestScheduler scheduler,
                         ArrayFile.Writer fetcherDb,
                         ArrayFile.Writer rawDb,
                         ArrayFile.Writer strippedDb) {
    this.scheduler= scheduler;
    this.fetcherDb= fetcherDb;
    this.rawDb= rawDb;
    this.strippedDb= strippedDb;
    this.parser = new DOMFragmentParser();
    this.robotsMeta= new RobotsMetaIndicator();
  }

  /**
   * Polls the {@link RequestScheduler} for output to service,
   * until {@link RequestScheduler#finishedOutput()} returns
   * <code>true</code>.  Each processed entry is handed back to the
   * scheduler on the following call to
   * <code>returnOutputAndGetNext()</code>.
   */
  public void run() {
    RequestRecord prevOutputRequest= null;
    String prevUrlString= null;

    while (!scheduler.finishedOutput()) {
      RequestRecord outputEntry= null;
      try {
        outputEntry= scheduler.returnOutputAndGetNext(prevOutputRequest,
                                                      prevUrlString);
        // the previous entry has been handed back; do not return it twice
        prevOutputRequest= null;
        prevUrlString= null;
      } catch (Exception e) {
        // On failure the previous entry is retained, so the next
        // iteration tries to hand it back again.
        LOG.log(Level.SEVERE,
                "Exception caught during call to"
                + " RequestScheduler.returnOutputAndGetNext()!: " + e,
                e);
      }

      if (outputEntry == null) {
        try {
          Thread.sleep(NO_OUTPUT_DELAY_MS);
        } catch (InterruptedException ignored) {
          // Deliberately ignored: termination of this loop is governed
          // solely by scheduler.finishedOutput().
        }
        continue;
      }

      String urlString;
      if (outputEntry.getURL() != null) {
        urlString= outputEntry.getURLString();
      } else {
        urlString=
          outputEntry.getFetchListEntry().getPage().getURL().toString();
      }

      if (LOG.isLoggable(Level.FINER))
        LOG.finer("going to write " + urlString );

      try {
        if ( (outputEntry.getResponse() != null)
             && (!outputEntry.getHasFailed()) ) {
          handleFetch(outputEntry.getURL(), outputEntry.getFetchListEntry(),
                      outputEntry.getResponse());
        } else {
          // no usable response: SUCCESS only if nothing failed and the
          // entry was not meant to be fetched in the first place
          int status = (!outputEntry.getHasFailed()
                        && !outputEntry.getFetchListEntry().getFetch()) ?
            FetcherOutput.SUCCESS :
            FetcherOutput.RETRY;
          handleNoFetch(outputEntry.getFetchListEntry(), status);
        }
      } catch (org.w3c.dom.DOMException e) {
        // NOTE(review): DOMException -> OUT_DOM_ERROR here while
        // DOMErrorException -> OUT_DOM_EXCEPTION below looks crossed;
        // mapping kept as-is, confirm against FetcherConstants.
        outputEntry.setOutputStatus(OUT_DOM_ERROR);
      } catch (UnhandledContentTypeException e) {
        outputEntry.setOutputStatus(OUT_UNKNOWN_CONTENT);
        outputEntry.setOutputStatusMessages(
          new String[] { e.getContentType() } );
      } catch (DOMErrorException e) {
        outputEntry.setOutputStatus(OUT_DOM_EXCEPTION);
      } catch (Exception e) {
        if (LOG.isLoggable(Level.FINE))
          LOG.log(Level.FINE, "unexpected error processing " + urlString, e);
        outputEntry.setOutputStatus(OUT_UNKNOWN);
        outputEntry.setOutputStatusMessages(
          new String[] { e.toString() } );
      }

      // fixme: ought to output an error to the DBs
      // to prevent immediate re-fetch!

      prevOutputRequest= outputEntry;
      prevUrlString= urlString;
    }

    // hand back the last entry processed before the scheduler finished
    scheduler.returnOutputAndGetNext(prevOutputRequest,
                                     prevUrlString);
  }

  /**
   * Returns <code>true</code> if a response with the given Content-Type
   * header value should be parsed as HTML.  A missing header is treated
   * as HTML.  The match is case-insensitive and ignores surrounding
   * whitespace, since media type names are not case-sensitive.
   */
  private static boolean isHtmlContentType(String contentType) {
    if (contentType == null)
      return true;
    return contentType.trim().regionMatches(true, 0, HTML_CONTENT_TYPE, 0,
                                            HTML_CONTENT_TYPE.length());
  }

  /**
   * Parses a successfully fetched page and writes its title, text,
   * outlinks and raw content, honouring the page's robots meta
   * directives (noindex, nofollow, nocache).
   *
   * @param url      URL the response was fetched from; also the base
   *                 for outlinks when the page declares no base href
   * @param fle      fetch-list entry the response belongs to
   * @param response the fetched response
   * @throws UnhandledContentTypeException if the response declares a
   *         Content-Type that is not HTML
   * @throws DOMErrorException if the parser fails with anything other
   *         than a {@link org.w3c.dom.DOMException}, which is
   *         propagated unchanged
   */
  private void handleFetch(URL url, FetchListEntry fle,
                           Response response)
    throws IOException, SAXException, UnhandledContentTypeException,
           DOMErrorException {

    String contentType = response.getHeader("Content-Type");
    if (!isHtmlContentType(contentType))
      throw new UnhandledContentTypeException(contentType);

    byte[] fetched = response.getContent();

    DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
    try {
      parser.parse(new InputSource           // parse content
                     (new ByteArrayInputStream(fetched)),
                   node);
    } catch (org.w3c.dom.DOMException e) {
      // expect this, rethrow
      throw e;
    } catch (Exception e) {
      throw new DOMErrorException(e);
    }

    RobotsMetaProcessor.getRobotsMetaDirectives(robotsMeta, node, url);

    String text;
    String title;
    if (!robotsMeta.getNoIndex()) {
      StringBuffer sb = new StringBuffer();
      DOMContentUtils.getText(sb, node);
      text = sb.toString();
      sb.setLength(0);                       // reuse the buffer for the title
      DOMContentUtils.getTitle(sb, node);
      title = sb.toString().trim();
    } else {
      FetcherStatus.logTraceMisc(FetcherStatus.MISC_META_NOINDEX, url);
      text= "";
      title= "";
    }

    Outlink[] outlinks;
    if (!robotsMeta.getNoFollow()) {
      URL baseURL= robotsMeta.getBaseHref();
      if (baseURL == null)
        baseURL= url;
      ArrayList l = new ArrayList();
      DOMContentUtils.getOutlinks(baseURL, l, node);
      outlinks = (Outlink[])l.toArray(new Outlink[l.size()]);
      if (LOG.isLoggable(Level.FINE))
        LOG.fine("found " + outlinks.length + " outlinks in " + url);
    } else {
      outlinks = new Outlink[0];
      FetcherStatus.logTraceMisc(FetcherStatus.MISC_META_NOFOLLOW, url);
    }

    byte[] content;
    if (!robotsMeta.getNoCache()) {
      content= fetched;
    } else {
      FetcherStatus.logTraceMisc(FetcherStatus.MISC_META_NOCACHE, url);
      content= new byte[0];
    }

    // the digest always covers the fetched bytes, even when nocache
    // keeps them out of the raw db
    outputPage(new FetcherOutput(fle, MD5Hash.digest(fetched),
                                 FetcherOutput.SUCCESS, title, outlinks),
               new FetcherContent(content),
               new FetcherText(text));
  }

  /**
   * Writes an empty record (no title, outlinks, content or text) with
   * the given status for an entry that produced no usable response.
   * The digest is computed from the entry's URL string.
   */
  private void handleNoFetch(FetchListEntry fle, int status) {
    String pageUrl = fle.getPage().getURL().toString();
    outputPage(new FetcherOutput(fle, MD5Hash.digest(pageUrl),
                                 status, "", new Outlink[0]),
               new FetcherContent(new byte[0]),
               new FetcherText(""));
  }

  /**
   * Appends one record to each of the three output databases while
   * holding the monitor of <code>fetcherDb</code>.  Any failure is
   * logged and otherwise ignored; a failure part-way through leaves
   * the earlier appends in place.
   */
  private void outputPage(FetcherOutput fo, FetcherContent raw,
                          FetcherText stripped) {
    try {
      synchronized (fetcherDb) {
        fetcherDb.append(fo);
        rawDb.append(raw);
        strippedDb.append(stripped);
      }
    } catch (Throwable t) {
      LOG.log(Level.SEVERE, "error writing output:" + t.toString(), t);
    }
  }
}